Click to show code
print('hello')
#> [1] "hello"The following machine learning project focuses on…
print('hello')
#> [1] "hello"# Python code
import numpy as np
print(np.mean([10, 20, 30, 40, 50]))
#> 30.0properties <- read.csv(file.path(here(),"data/properties.csv"))
# show 1000 first rows of properties using reactable
reactable(head(properties, 1000))
# Create a tibble with cantons and observations
observations_table <- tibble(
Canton = c("Vaud", "Bern", "Lucerne", "Zurich", "Uri", "Schwyz",
"Obwalden", "Nidwalden", "Glarus", "St. Gallen", "Grisons",
"Aargau", "Thurgau", "Ticino", "Valais", "Neuchatel",
"Geneva", "Jura", "Zug", "Fribourg", "Solothurn",
"Basel-Stadt", "Basel-Landschaft", "Schaffhausen",
"Appenzell-Ausser-Rhoden", "Appenzell-Inner-Rhoden", "Total"),
Observations = c(3232, 1553, 376, 1191, 71, 93, 29, 51, 55, 757, 405,
1481, 553, 4230, 3601, 513, 629, 329, 69, 1242, 590,
149, 705, 118, 102, 12, sum(c(3232, 1553, 376, 1191, 71, 93, 29, 51, 55, 757, 405,
1481, 553, 4230, 3601, 513, 629, 329, 69, 1242, 590,
149, 705, 118, 102, 12)))
)
# Display the table using kable and kableExtra
observations_table %>%
kbl(caption = "Number of Observations by Canton") %>%
kable_styling(position = "center", bootstrap_options = c("striped", "bordered", "hover")) %>%
add_header_above(c(" " = 1, "Observations" = 1)) # Adds headers spanning columns| Canton | Observations |
|---|---|
| Vaud | 3232 |
| Bern | 1553 |
| Lucerne | 376 |
| Zurich | 1191 |
| Uri | 71 |
| Schwyz | 93 |
| Obwalden | 29 |
| Nidwalden | 51 |
| Glarus | 55 |
| St. Gallen | 757 |
| Grisons | 405 |
| Aargau | 1481 |
| Thurgau | 553 |
| Ticino | 4230 |
| Valais | 3601 |
| Neuchatel | 513 |
| Geneva | 629 |
| Jura | 329 |
| Zug | 69 |
| Fribourg | 1242 |
| Solothurn | 590 |
| Basel-Stadt | 149 |
| Basel-Landschaft | 705 |
| Schaffhausen | 118 |
| Appenzell-Ausser-Rhoden | 102 |
| Appenzell-Inner-Rhoden | 12 |
| Total | 22136 |
# Identify values causing the issue
problematic_values <- properties$number_of_rooms[is.na(as.numeric(properties$number_of_rooms))]
#> Warning: NAs introduced by coercion
# Replace non-numeric values with NA
properties$number_of_rooms <- as.numeric(gsub("[^0-9.]", "", properties$number_of_rooms))
# Remove non-numeric characters and convert to numeric
properties$price <- as.numeric(gsub("[^0-9]", "", properties$price))
# Subset the dataset to exclude rows with price < 20000
properties <- properties[properties$price >= 20000, ]
# Subset the dataset to exclude rows with numbers of rooms < 25
properties <- properties[properties$number_of_rooms <25, ]
# Replace incomplete addresses
properties$address <- gsub("^\\W*[.,0-]\\W*", "", properties$address)
properties_filtered <- na.omit(properties)
properties_filtered$year_category <- substr(properties_filtered$year_category, 1, 9)
# Assuming 'year_category' is a column in the 'properties' dataset
properties_filtered$year_category <- as.factor(properties_filtered$year_category)
# Preprocess the number_of_rooms column
properties_filtered$number_of_rooms <- as.character(properties_filtered$number_of_rooms)
properties_filtered$number_of_rooms <- gsub("\\D", "", properties_filtered$number_of_rooms) # Remove non-numeric characters
properties_filtered$number_of_rooms <- as.numeric(properties_filtered$number_of_rooms) # Convert to numeric
properties_filtered$number_of_rooms <- trunc(properties_filtered$number_of_rooms) # Truncate non-integer values
# remove m^2 from column 'square_meters'
properties_filtered$square_meters <- as.numeric(gsub("\\D", "", properties_filtered$square_meters))
# print how many NA observations left in square_meters
print(sum(is.na(properties_filtered$square_meters)))
#> [1] 988
# remove NA
properties_filtered <- properties_filtered[!is.na(properties_filtered$square_meters),]
# add majuscule to canton
properties_filtered$canton <- tools::toTitleCase(properties_filtered$canton)
# show 100 first row of cleaned dataset using reactable
reactable(head(properties_filtered, 100))df <- properties_filtered
#the address column is like : '1844 Villeneuve VD' and has zip code number in it
#taking out the zip code number and creating a new column 'zip_code'
#the way to identify the zip code is to identify numbers that are 4 digits long
df$zip_code <- as.numeric(gsub("\\D", "", df$address))
#removing the first two number of zip code has more than 4 number
df$zip_code <- ifelse(df$zip_code > 9999, df$zip_code %% 10000, df$zip_code)#read .csv AMTOVZ_CSV_LV95
amto <- read.csv(file.path(here(),"data/AMTOVZ_CSV_LV95.csv"), sep = ";")
#creating a new dataframe with 'Ortschaftsname' as 'City'Place_name', 'PLZ' as 'zip_code' and 'KantonskÃ.rzel' as 'Canton_code'
amto_df <- data.frame(City=amto$Ortschaftsname, zip_code=amto$PLZ, Canton_code=amto$Kantonskürzel)
# display 100 first rows of atmo_df using reactable
reactable::reactable(head(amto_df, 1000))#merge the two dataframes 'df' and 'amto_df' on 'zip_code'
df <- merge(df, amto_df, by='zip_code', all.x=TRUE)
#check if there are nan in city
df[is.na(df$City),]
#> zip_code price number_of_rooms square_meters
#> 1 25 2200000 65 165
#> 2 25 2200000 10 263
#> 3 26 655000 35 66
#> 4 26 1995000 75 180
#> 5 322 870000 25 59
#> 6 322 880000 25 55
#> 7 322 975000 35 56
#> 336 1014 1510000 55 146
#> 2127 1200 16092000 7 400
#> 2128 1200 3285450 5 230
#> 2129 1200 679000 55 142
#> 11521 1919 785000 35 103
#> 11522 1919 1908000 65 210
#> 11523 1919 2558620 55 270
#> 11524 1919 1065000 45 130
#> 17583 2500 1100000 5 154
#> 17584 2500 420000 45 115
#> 17585 2500 885500 55 130
#> 17586 2500 872500 45 144
#> 17587 2500 872500 45 138
#> 17588 2500 887500 55 130
#> 17589 2500 870500 45 125
#> 17590 2500 892500 45 144
#> 17591 2500 885500 55 130
#> 17592 2500 1050000 45 121
#> 17593 2500 877500 45 138
#> 17594 2500 887500 45 144
#> 17595 2500 1450000 55 198
#> 19178 3000 820000 55 165
#> 19179 3000 1140000 35 115
#> 19180 3000 1090000 35 115
#> 19181 3000 1090000 55 193
#> 19182 3000 920000 45 157
#> 19183 3000 1090000 55 193
#> 19184 3000 1590000 55 330
#> 19185 3000 720000 35 102
#> 19186 3000 920000 45 157
#> 26756 4000 180000 3 70
#> 26757 4000 975000 45 125
#> 26758 4000 2100000 65 360
#> 30233 5201 725000 35 95
#> 31403 6000 695000 45 133
#> 33026 6511 440000 2 64
#> 33433 6547 15000000 75 220
#> 34562 6602 2800000 75 242
#> 34563 6602 2800000 65 250
#> 34564 6602 270000 15 28
#> 34565 6602 450000 35 75
#> 34566 6604 1990000 45 220
#> 34567 6604 2668590 55 290
#> 34568 6604 760000 35 78
#> 39888 6901 3660930 45 290
#> 39889 6901 3660930 45 290
#> 39890 6903 790000 35 105
#> 39891 6907 995000 45 114
#> 39892 6907 995000 45 114
#> 39893 6911 469350 55 140
#> 39894 6911 610000 35 103
#> 39895 6911 660000 75 200
#> 39896 6911 737550 45 82
#> 41884 7133 2266290 55 160
#> 41901 7135 2690000 85 236
#> 42256 8000 2100000 45 152
#> 42257 8000 1650000 45 142
#> 42258 8000 925000 35 102
#> 42259 8000 1650000 45 142
#> 42260 8000 1150000 45 128
#> 42261 8000 1450000 55 143
#> 42262 8000 1990000 55 200
#> 42263 8000 1990000 55 200
#> 42264 8000 975000 45 122
#> 42265 8000 2495000 55 482
#> 43220 8238 245000 2 49
#> 43955 8423 2110000 65 204
#> 43956 8423 2190000 55 167
#> 46558 9241 545000 45 100
#> 46559 9241 730840 55 130
#> address
#> 1 1000 Lausanne 25
#> 2 1000 Lausanne 25
#> 3 1000 Lausanne 26
#> 4 Lausanne 26, 1000 Lausanne 26
#> 5 Via Cuolm Liung 30d, 7032 Laax GR 2
#> 6 7032 Laax GR 2
#> 7 Via Murschetg 29, 7032 Laax GR 2
#> 336 1014 Lausanne
#> 2127 1200 Genève
#> 2128 1200 Genève
#> 2129 Chemin des pralets, 74100 Etrembières, 1200 Genève
#> 11521 1919 Martigny
#> 11522 1919 Martigny
#> 11523 1919 Martigny
#> 11524 1919 Martigny
#> 17583 2500 Biel/Bienne
#> 17584 2500 Biel/Bienne
#> 17585 2500 Biel/Bienne
#> 17586 2500 Biel/Bienne
#> 17587 2500 Biel/Bienne
#> 17588 2500 Biel/Bienne
#> 17589 2500 Biel/Bienne
#> 17590 2500 Biel/Bienne
#> 17591 2500 Biel/Bienne
#> 17592 Hohlenweg 11b, 2500 Biel/Bienne
#> 17593 2500 Biel/Bienne
#> 17594 2500 Biel/Bienne
#> 17595 2500 Bienne
#> 19178 3000 Bern
#> 19179 3000 Bern
#> 19180 3000 Bern
#> 19181 3000 Bern
#> 19182 3000 Bern
#> 19183 3000 Bern
#> 19184 3000 Bern
#> 19185 3000 Bern
#> 19186 3000 Bern
#> 26756 Lörrach Brombach Steinsack 6, 4000 Basel
#> 26757 4000 Basel
#> 26758 4000 Basel
#> 30233 5201 Brugg AG
#> 31403 in TRIENGEN, ca. 20 min. bei Luzern, 6000 Luzern
#> 33026 6511 Cadenazzo
#> 33433 Augio 1F, 6547 Augio
#> 34562 6602 Muralto
#> 34563 6602 Muralto
#> 34564 6602 Muralto
#> 34565 Via Bacilieri 2, 6602 Muralto
#> 34566 6604 Solduno
#> 34567 6604 Solduno
#> 34568 6604 Locarno
#> 39888 6901 Lugano
#> 39889 6901 Lugano
#> 39890 6903 Lugano
#> 39891 6907 MASSAGNO
#> 39892 6907 MASSAGNO
#> 39893 6911 Campione d'Italia
#> 39894 6911 Campione d'Italia
#> 39895 6911 Campione d'Italia
#> 39896 6911 Campione d'Italia
#> 41884 Inder Platenga 34, 7133 Obersaxen
#> 41901 7135 Fideris
#> 42256 8000 Zürich
#> 42257 8000 Zürich
#> 42258 8000 Zürich
#> 42259 8000 Zürich
#> 42260 8000 Zürich
#> 42261 8000 Zürich
#> 42262 8000 Zürich
#> 42263 8000 Zürich
#> 42264 8000 Zürich
#> 42265 8000 Zürich
#> 43220 Stemmerstrasse 14, 8238 Büsingen am Hochrhein
#> 43955 Chüngstrasse 60, 8423 Embrach
#> 43956 Chüngstrasse 48, 8423 Embrach
#> 46558 9241 Kradolf
#> 46559 9241 Kradolf
#> canton property_type floor year_category City
#> 1 Vaud Villa 2006-2010 <NA>
#> 2 Vaud Single house 1919-1945 <NA>
#> 3 Vaud Apartment noteg 2016-2024 <NA>
#> 4 Vaud Villa 1961-1970 <NA>
#> 5 Grisons Apartment eg 2016-2024 <NA>
#> 6 Grisons Apartment noteg 2016-2024 <NA>
#> 7 Grisons Apartment noteg 2011-2015 <NA>
#> 336 Vaud Apartment eg 2011-2015 <NA>
#> 2127 Geneva Single house 2011-2015 <NA>
#> 2128 Geneva Bifamiliar house 1981-1990 <NA>
#> 2129 Geneva Bifamiliar house 2016-2024 <NA>
#> 11521 Valais Apartment noteg 2016-2024 <NA>
#> 11522 Valais Apartment noteg 2016-2024 <NA>
#> 11523 Valais Attic flat noteg 2016-2024 <NA>
#> 11524 Valais Apartment noteg 2016-2024 <NA>
#> 17583 Bern Single house 2001-2005 <NA>
#> 17584 Bern Apartment noteg 1971-1980 <NA>
#> 17585 Bern Villa 2016-2024 <NA>
#> 17586 Bern Villa 2016-2024 <NA>
#> 17587 Bern Single house 2016-2024 <NA>
#> 17588 Bern Single house 2016-2024 <NA>
#> 17589 Bern Single house 2016-2024 <NA>
#> 17590 Bern Single house 2016-2024 <NA>
#> 17591 Bern Single house 2016-2024 <NA>
#> 17592 Bern Single house 2001-2005 <NA>
#> 17593 Bern Single house 2016-2024 <NA>
#> 17594 Bern Single house 2016-2024 <NA>
#> 17595 Bern Single house 2016-2024 <NA>
#> 19178 Bern Apartment noteg 2016-2024 <NA>
#> 19179 Bern Apartment eg 2016-2024 <NA>
#> 19180 Bern Apartment eg 2016-2024 <NA>
#> 19181 Bern Roof flat noteg 2016-2024 <NA>
#> 19182 Bern Apartment noteg 2016-2024 <NA>
#> 19183 Bern Apartment noteg 2016-2024 <NA>
#> 19184 Bern Apartment noteg 1991-2000 <NA>
#> 19185 Bern Apartment eg 2016-2024 <NA>
#> 19186 Bern Duplex noteg 2016-2024 <NA>
#> 26756 Basel-Stadt Single house 1961-1970 <NA>
#> 26757 Basel-Stadt Single house 2016-2024 <NA>
#> 26758 Basel-Stadt Villa 2016-2024 <NA>
#> 30233 Aargau Apartment noteg 2016-2024 <NA>
#> 31403 Lucerne Apartment noteg 1991-2000 <NA>
#> 33026 Ticino Apartment noteg 2016-2024 <NA>
#> 33433 Grisons Single house 2016-2024 <NA>
#> 34562 Ticino Single house 1981-1990 <NA>
#> 34563 Ticino Single house 1981-1990 <NA>
#> 34564 Ticino Apartment eg 1961-1970 <NA>
#> 34565 Ticino Apartment noteg 1946-1960 <NA>
#> 34566 Ticino Attic flat noteg 2011-2015 <NA>
#> 34567 Ticino Apartment noteg 2011-2015 <NA>
#> 34568 Ticino Apartment noteg 2011-2015 <NA>
#> 39888 Ticino Attic flat noteg 2011-2015 <NA>
#> 39889 Ticino Apartment noteg 2011-2015 <NA>
#> 39890 Ticino Apartment noteg 2006-2010 <NA>
#> 39891 Ticino Apartment noteg 2016-2024 <NA>
#> 39892 Ticino Apartment noteg 2016-2024 <NA>
#> 39893 Ticino Apartment noteg 1946-1960 <NA>
#> 39894 Ticino Apartment eg 1946-1960 <NA>
#> 39895 Ticino Single house 1971-1980 <NA>
#> 39896 Ticino Apartment noteg 1991-2000 <NA>
#> 41884 Grisons Single house 2006-2010 <NA>
#> 41901 Grisons Single house 0-1919 <NA>
#> 42256 Zurich Apartment noteg 2016-2024 <NA>
#> 42257 Zurich Attic flat noteg 2016-2024 <NA>
#> 42258 Zurich Apartment noteg 2016-2024 <NA>
#> 42259 Zurich Apartment noteg 2016-2024 <NA>
#> 42260 Zurich Apartment noteg 2016-2024 <NA>
#> 42261 Zurich Apartment eg 2016-2024 <NA>
#> 42262 Zurich Apartment noteg 2006-2010 <NA>
#> 42263 Zurich Attic flat noteg 2006-2010 <NA>
#> 42264 Zurich Single house 2016-2024 <NA>
#> 42265 Zurich Apartment noteg 0-1919 <NA>
#> 43220 Schaffhausen Apartment noteg 1961-1970 <NA>
#> 43955 Zurich Bifamiliar house 2016-2024 <NA>
#> 43956 Zurich Single house 2016-2024 <NA>
#> 46558 Thurgau Apartment noteg 1991-2000 <NA>
#> 46559 Thurgau Apartment noteg 1991-2000 <NA>
#> Canton_code
#> 1 <NA>
#> 2 <NA>
#> 3 <NA>
#> 4 <NA>
#> 5 <NA>
#> 6 <NA>
#> 7 <NA>
#> 336 <NA>
#> 2127 <NA>
#> 2128 <NA>
#> 2129 <NA>
#> 11521 <NA>
#> 11522 <NA>
#> 11523 <NA>
#> 11524 <NA>
#> 17583 <NA>
#> 17584 <NA>
#> 17585 <NA>
#> 17586 <NA>
#> 17587 <NA>
#> 17588 <NA>
#> 17589 <NA>
#> 17590 <NA>
#> 17591 <NA>
#> 17592 <NA>
#> 17593 <NA>
#> 17594 <NA>
#> 17595 <NA>
#> 19178 <NA>
#> 19179 <NA>
#> 19180 <NA>
#> 19181 <NA>
#> 19182 <NA>
#> 19183 <NA>
#> 19184 <NA>
#> 19185 <NA>
#> 19186 <NA>
#> 26756 <NA>
#> 26757 <NA>
#> 26758 <NA>
#> 30233 <NA>
#> 31403 <NA>
#> 33026 <NA>
#> 33433 <NA>
#> 34562 <NA>
#> 34563 <NA>
#> 34564 <NA>
#> 34565 <NA>
#> 34566 <NA>
#> 34567 <NA>
#> 34568 <NA>
#> 39888 <NA>
#> 39889 <NA>
#> 39890 <NA>
#> 39891 <NA>
#> 39892 <NA>
#> 39893 <NA>
#> 39894 <NA>
#> 39895 <NA>
#> 39896 <NA>
#> 41884 <NA>
#> 41901 <NA>
#> 42256 <NA>
#> 42257 <NA>
#> 42258 <NA>
#> 42259 <NA>
#> 42260 <NA>
#> 42261 <NA>
#> 42262 <NA>
#> 42263 <NA>
#> 42264 <NA>
#> 42265 <NA>
#> 43220 <NA>
#> 43955 <NA>
#> 43956 <NA>
#> 46558 <NA>
#> 46559 <NA>We have 144 NAN, where
Removed them ::: {.cell layout-align=“center”}
#remove the rows with nan in city
properties_filtered <- df[!is.na(df$City),]
#show the first 100 rows of the cleaned dataset using reactable
reactable(head(properties_filtered, 100)):::
#excel file modified for fribourg and liestal
# read csv
impots <- read.csv(file.path(here(),"data/statistik-steuerfuesse-np-1995-2023-fr.csv"), sep = ",", header = TRUE, stringsAsFactors = FALSE)
# Remove last 13 rows
impots <- head(impots, -14)
# Rename columns
colnames(impots) <- c("Chefs-lieux cantonaux", "Impôt cantonal", "Impôt communal", "Impôt paroissial Evang. / réform", "Impôt paroissial Kath. / cath.rom")
# Set row names to the content of the first column
rownames(impots) <- impots$`Chefs-lieux cantonaux`
# Remove the first column (which is now row names)
impots1 <- impots[, -1, drop = FALSE]
# Set the row names for impots1 the same as impots
rownames(impots1) <- rownames(impots)
impots <- impots1
rm(impots1)
# Write data to CSV based on here()
#write.csv(impots, file.path(here(),"data/impots.csv"), row.names = TRUE)
# show 100 first rows of impots using reactable
reactable(head(impots, 100))# # Create a copy of properties_filtered
# properties_merged <- properties_filtered
#
# # Loop through unique canton names in properties_filtered
# for (canton_name in unique(properties_filtered$canton)) {
# # Check if canton_name exists as a row name in impots
# if (canton_name %in% rownames(impots)) {
# # Get the corresponding row from impots
# impots_row <- impots[canton_name, ]
#
# # Add columns to properties_merged with impots content
# for (col_name in names(impots)) {
# properties_merged[properties_merged$canton == canton_name, paste0(col_name, "_impots")] <- impots_row[[col_name]]
# }
#
# # Remove impots_row from memory
# rm(impots_row)
# }
# }
#
# # Write data to CSV based on here()
# write.csv(properties_merged, file.path(here(),"data/properties_merged.csv"), row.names = TRUE)
# properties_filtered <- properties_merged
# # show 100 first rows of properties_merged using reactable
# reactable(head(properties_merged, 100))Replaces NAs in both Taux de couverture social and Political (Conseil National Datas) For Taux de couverture Social: NAs were due to reason “Q” = “Not indicated to protect confidentiality” We replaced the NAs by the average taux de couverture in Switzerland in 2019, which was 3.2%
For Political data: NAs were due to reason “M” = “Not indicated because data was not important or applicable” Therefore, we replaced the NAs by 0
# il faudra changer le path
commune_prep <- read.csv(file.path(here(),"data/commune_data.csv"), sep = ";", header = TRUE, stringsAsFactors = FALSE)
# We keep only 2019 to have some reference? (2020 is apparently not really complete)
commune_2019 <- subset(commune_prep, PERIOD_REF == "2019") %>%
select(c("REGION", "CODE_REGION", "INDICATORS", "VALUE", "STATUS"))
# delete les lignes ou Status = Q ou M (pas de valeur) et ensuite on enlève la colonne
commune_2019 <- subset(commune_2019, STATUS == "A") %>%
select(c("REGION", "CODE_REGION", "INDICATORS", "VALUE"))
# on enlève les lignes qui sont des aggrégats
commune_2019 <- subset(commune_2019, REGION != "Schweiz")
commune_2019 <- commune_2019 %>%
pivot_wider(names_from = INDICATORS, values_from = VALUE)
# Rename columns using the provided map
df_commune <- commune_2019 %>%
rename(`Population - Habitants` = Ind_01_01,
`Population - Densité de la population` = Ind_01_03,
`Population - Etrangers` = Ind_01_08,
`Population - Part du groupe d'âge 0-19 ans` = Ind_01_04,
`Population - Part du groupe d'âge 20-64 ans` = Ind_01_05,
`Population - Part du groupe d'âge 65+ ans` = Ind_01_06,
`Population - Taux brut de nuptialité` = Ind_01_09,
`Population - Taux brut de divortialité` = Ind_01_10,
`Population - Taux brut de natalité` = Ind_01_11,
`Population - Taux brut de mortalité` = Ind_01_12,
`Population - Ménages privés` = Ind_01_13,
`Population - Taille moyenne des ménages` = Ind_01_14,
`Sécurité sociale - Taux d'aide sociale` = Ind_11_01,
`Conseil national - PLR` = Ind_14_01,
`Conseil national - PDC` = Ind_14_02,
`Conseil national - PS` = Ind_14_03,
`Conseil national - UDC` = Ind_14_04,
`Conseil national - PEV/PCS` = Ind_14_05,
`Conseil national - PVL` = Ind_14_06,
`Conseil national - PBD` = Ind_14_07,
`Conseil national - PST/Sol.` = Ind_14_08,
`Conseil national - PES` = Ind_14_09,
`Conseil national - Petits partis de droite` = Ind_14_10)
# If no one voted for a party, set as NA -> replacing it with 0 instead
df_commune <- df_commune %>%
mutate_at(vars(starts_with("Conseil national")), ~replace_na(., 0))
# Removing NAs from Taux de couverture sociale column
# Setting the mean as the mean for Switzerland in 2019 (3.2%)
mean_taux_aide_social <- 3.2
# Replace NA values with the mean
df_commune <- df_commune %>%
mutate(`Sécurité sociale - Taux d'aide sociale` = if_else(is.na(`Sécurité sociale - Taux d'aide sociale`), mean_taux_aide_social, `Sécurité sociale - Taux d'aide sociale`))
#show 100 first rows of df_commune using reactable
reactable(head(df_commune, 100))
# commune_prep <- read.csv(file.path(here(),"data/commune_data.csv"), sep = ";", header = TRUE, stringsAsFactors = FALSE)
#
# # We keep only 2019 to have some reference? (2020 is apparently not really complete)
# commune_2019 <- subset(commune_prep, PERIOD_REF == "2019") %>%
# select(c("REGION", "CODE_REGION", "INDICATORS", "VALUE", "STATUS"))
#
# # delete les lignes ou Status = Q ou M (pas de valeur) et ensuite on enlève la colonne
# commune_2019 <- subset(commune_2019, STATUS == "A") %>%
# select(c("REGION", "CODE_REGION", "INDICATORS", "VALUE"))
#
# # on enlève les lignes qui sont des aggrégats
# commune_2019 <- subset(commune_2019, REGION != "Schweiz")
#
# commune_2019 <- commune_2019 %>%
# pivot_wider(names_from = INDICATORS, values_from = VALUE)
#
# # Rename columns using the provided map
# df_commune <- commune_2019 %>%
# rename(`Population - Habitants` = Ind_01_01,
# `Population - Densité de la population` = Ind_01_03,
# `Population - Etrangers` = Ind_01_08,
# `Population - Part du groupe d'âge 0-19 ans` = Ind_01_04,
# `Population - Part du groupe d'âge 20-64 ans` = Ind_01_05,
# `Population - Part du groupe d'âge 65+ ans` = Ind_01_06,
# `Population - Taux brut de nuptialité` = Ind_01_09,
# `Population - Taux brut de divortialité` = Ind_01_10,
# `Population - Taux brut de natalité` = Ind_01_11,
# `Population - Taux brut de mortalité` = Ind_01_12,
# `Population - Ménages privés` = Ind_01_13,
# `Population - Taille moyenne des ménages` = Ind_01_14,
# `Sécurité sociale - Taux d'aide sociale` = Ind_11_01,
# `Conseil national - PLR` = Ind_14_01,
# `Conseil national - PDC` = Ind_14_02,
# `Conseil national - PS` = Ind_14_03,
# `Conseil national - UDC` = Ind_14_04,
# `Conseil national - PEV/PCS` = Ind_14_05,
# `Conseil national - PVL` = Ind_14_06,
# `Conseil national - PBD` = Ind_14_07,
# `Conseil national - PST/Sol.` = Ind_14_08,
# `Conseil national - PES` = Ind_14_09,
# `Conseil national - Petits partis de droite` = Ind_14_10)
#
# # If no one voted for a party, set as NA -> replacing it with 0 instead
# df_commune <- df_commune %>%
# mutate_at(vars(starts_with("Conseil national")), ~replace_na(., 0))
#
#
# # Removing NAs from Taux de couverture sociale column
# # Setting the mean as the mean for Switzerland in 2019 (3.2%)
# mean_taux_aide_social <- 3.2
#
# # Replace NA values with the mean
# df_commune <- df_commune %>%
# mutate(`Sécurité sociale - Taux d'aide sociale` = if_else(is.na(`Sécurité sociale - Taux d'aide sociale`), mean_taux_aide_social, `Sécurité sociale - Taux d'aide sociale`))
# histogram_price <- ggplot(properties_filtered, aes(x = price)) +
geom_histogram(binwidth = 100000, fill = "skyblue", color = "red") +
labs(title = "Distribution of Prices",
x = "Price",
y = "Frequency") +
theme_minimal()
# Convert ggplot object to plotly object
interactive_histogram_price <- ggplotly(histogram_price, width = 600, height = 400 )
# Display the interactive histogram
interactive_histogram_pricenote : only price between 0 and 500000 so some outliers aren’t here
# Create the ggplot object
histogram <- ggplot(properties_filtered, aes(x = price)) +
geom_histogram(binwidth = 100000, fill = "skyblue", color = "black") +
facet_wrap(~ property_type, scales = "free", ncol = 2) +
labs(title = "Distribution of Prices by Property Type",
x = "Price",
y = "Frequency") +
theme_minimal() +
xlim(0, 5000000)
# Convert ggplot object to plotly object
interactive_histogram <- ggplotly(histogram, width = 600, height = 1000)
# Display the interactive plot
interactive_histogramnote : only price between 0 and 500000 so some outliers aren’t here
# Create a histogram of prices for each year category
histogram <- ggplot(properties_filtered, aes(x = price)) +
geom_histogram(binwidth = 100000, fill = "skyblue", color = "black") +
facet_wrap(~ year_category, scales = "free", ncol = 2) +
labs(title = "Distribution of Prices by Year Category",
x = "Price",
y = "Frequency") +
theme_minimal() +
xlim(0, 5000000)
# Convert ggplot object to plotly object
interactive_histogram_year <- ggplotly(histogram, width = 600, height = 1000)
# Display the interactive plot
interactive_histogram_yearnote : only price between 0 and 500000 so some outliers aren’t here
histogram <- ggplot(properties_filtered, aes(x = price)) +
geom_histogram(binwidth = 100000, fill = "skyblue", color = "black") +
facet_wrap(~ canton, scales = "free", ncol = 2) +
labs(title = "Distribution of Prices by Canton",
x = "Price",
y = "Frequency") +
theme_minimal() +
xlim(0, 5000000)
# Convert ggplot object to plotly object with adjusted height
interactive_histogram <- ggplotly(histogram, width = 600, height = 1000) %>%
layout(height = 1000) # Adjust the height as needed
# Display the interactive plot
interactive_histogramnote : only price between 0 and 500000 so some outliers aren’t here
and the graph below only show apartments with less than 10 rooms (but you can change the code if needed
properties_room <- properties_filtered[properties_filtered$number_of_rooms < 20, ] # Filter only number_of_rooms less than 20
# Create a histogram of prices for each number of rooms
histogram <- ggplot(properties_room, aes(x = price)) +
geom_histogram(binwidth = 100000, fill = "skyblue", color = "black") +
facet_wrap(~ number_of_rooms, scales = "free", ncol = 2) +
labs(title = "Distribution of Prices by Number of Rooms",
x = "Price",
y = "Frequency") +
theme_minimal() +
xlim(0, 5000000)
# Convert ggplot object to plotly object with adjusted height
interactive_histogram <- ggplotly(histogram, width = 600, height = 1000)
# Display the interactive plot
interactive_histogram# colnames(properties_filtered)[(ncol(properties_filtered) - 3):ncol(properties_filtered)] <- gsub("\\s+", "_", colnames(properties_filtered)[(ncol(properties_filtered) - 3):ncol(properties_filtered)])
#
# # Create a scatter plot to visualize correlation between price and Impôt cantonal
# scatter_plot <- ggplot(properties_filtered, aes(x = price, y = Impôt_cantonal_impots)) +
# geom_point() +
# labs(title = "Correlation between Price and Impôt cantonal",
# x = "Price",
# y = "Impôt cantonal") +
# theme_minimal()
#
# # Convert ggplot object to plotly object
# interactive_plot <- ggplotly(scatter_plot)
#
# # Display the interactive plot
# interactive_plot# Perform multiple linear regression
model <- lm(price ~ number_of_rooms + square_meters + canton + property_type + year_category, data = properties_filtered)
# Summarize the regression model
summary(model)
#>
#> Call:
#> lm(formula = price ~ number_of_rooms + square_meters + canton +
#> property_type + year_category, data = properties_filtered)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -12863879 -318837 -59535 211251 16094914
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) -571974.1 29124.7 -19.64 < 2e-16
#> number_of_rooms -664.1 232.1 -2.86 0.00422
#> square_meters 9670.6 48.7 198.68 < 2e-16
#> cantonAppenzell-Ausser-Rhoden -49354.9 58483.4 -0.84 0.39872
#> cantonAppenzell-Inner-Rhoden 119076.8 119190.4 1.00 0.31778
#> cantonBasel-Landschaft 119188.9 32646.7 3.65 0.00026
#> cantonBasel-Stadt 550598.5 75191.0 7.32 2.5e-13
#> cantonBern -143302.0 25791.4 -5.56 2.8e-08
#> cantonFribourg -138139.4 26509.2 -5.21 1.9e-07
#> cantonGeneva 1369129.7 37877.4 36.15 < 2e-16
#> cantonGlarus -257327.7 97006.6 -2.65 0.00799
#> cantonGrisons 236239.9 45281.4 5.22 1.8e-07
#> cantonJura -634396.0 43838.1 -14.47 < 2e-16
#> cantonLucerne 127650.9 37351.1 3.42 0.00063
#> cantonNeuchatel -227592.3 35855.3 -6.35 2.2e-10
#> cantonNidwalden 446997.2 92461.7 4.83 1.3e-06
#> cantonObwalden 813963.3 115379.6 7.05 1.8e-12
#> cantonSchaffhausen -118840.0 69881.8 -1.70 0.08903
#> cantonSchwyz 430523.3 73325.5 5.87 4.3e-09
#> cantonSolothurn -196911.0 35907.3 -5.48 4.2e-08
#> cantonSt-Gallen -54343.2 30354.3 -1.79 0.07341
#> cantonThurgau -83863.1 32028.8 -2.62 0.00884
#> cantonTicino 51018.3 22919.1 2.23 0.02602
#> cantonUri 116303.2 86335.9 1.35 0.17795
#> cantonValais -43447.8 22514.3 -1.93 0.05364
#> cantonVaud 374336.6 23673.3 15.81 < 2e-16
#> cantonZug 1333285.5 62892.0 21.20 < 2e-16
#> cantonZurich 431451.8 29063.2 14.85 < 2e-16
#> property_typeAttic flat 71410.8 22532.7 3.17 0.00153
#> property_typeBifamiliar house -287182.6 22370.6 -12.84 < 2e-16
#> property_typeChalet 229320.1 24921.8 9.20 < 2e-16
#> property_typeDuplex -213772.6 29049.9 -7.36 1.9e-13
#> property_typeFarm house -463341.0 61245.2 -7.57 3.9e-14
#> property_typeLoft -193955.1 156278.0 -1.24 0.21458
#> property_typeRoof flat -112127.9 33006.5 -3.40 0.00068
#> property_typeRustic house -60336.5 124202.5 -0.49 0.62712
#> property_typeSingle house -224374.4 12767.8 -17.57 < 2e-16
#> property_typeTerrace flat 29723.7 46230.7 0.64 0.52026
#> property_typeVilla -46324.0 20056.1 -2.31 0.02091
#> year_category1919-1945 248287.6 32007.7 7.76 8.9e-15
#> year_category1946-1960 281495.6 29638.4 9.50 < 2e-16
#> year_category1961-1970 238844.7 24197.2 9.87 < 2e-16
#> year_category1971-1980 314981.2 21909.2 14.38 < 2e-16
#> year_category1981-1990 269024.8 22265.8 12.08 < 2e-16
#> year_category1991-2000 436391.4 23034.2 18.95 < 2e-16
#> year_category2001-2005 427219.4 28254.0 15.12 < 2e-16
#> year_category2006-2010 510524.9 24421.0 20.91 < 2e-16
#> year_category2011-2015 549657.7 23882.6 23.02 < 2e-16
#> year_category2016-2024 490791.8 18754.8 26.17 < 2e-16
#>
#> (Intercept) ***
#> number_of_rooms **
#> square_meters ***
#> cantonAppenzell-Ausser-Rhoden
#> cantonAppenzell-Inner-Rhoden
#> cantonBasel-Landschaft ***
#> cantonBasel-Stadt ***
#> cantonBern ***
#> cantonFribourg ***
#> cantonGeneva ***
#> cantonGlarus **
#> cantonGrisons ***
#> cantonJura ***
#> cantonLucerne ***
#> cantonNeuchatel ***
#> cantonNidwalden ***
#> cantonObwalden ***
#> cantonSchaffhausen .
#> cantonSchwyz ***
#> cantonSolothurn ***
#> cantonSt-Gallen .
#> cantonThurgau **
#> cantonTicino *
#> cantonUri
#> cantonValais .
#> cantonVaud ***
#> cantonZug ***
#> cantonZurich ***
#> property_typeAttic flat **
#> property_typeBifamiliar house ***
#> property_typeChalet ***
#> property_typeDuplex ***
#> property_typeFarm house ***
#> property_typeLoft
#> property_typeRoof flat ***
#> property_typeRustic house
#> property_typeSingle house ***
#> property_typeTerrace flat
#> property_typeVilla *
#> year_category1919-1945 ***
#> year_category1946-1960 ***
#> year_category1961-1970 ***
#> year_category1971-1980 ***
#> year_category1981-1990 ***
#> year_category1991-2000 ***
#> year_category2001-2005 ***
#> year_category2006-2010 ***
#> year_category2011-2015 ***
#> year_category2016-2024 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 922000 on 48028 degrees of freedom
#> Multiple R-squared: 0.557, Adjusted R-squared: 0.557
#> F-statistic: 1.26e+03 on 48 and 48028 DF, p-value: <2e-16
#show the result in the html
kable(summary(model)$coefficients, format = "html") %>%
kable_styling(full_width = F)| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | -571974 | 29124.7 | -19.639 | 0.000 |
| number_of_rooms | -664 | 232.1 | -2.861 | 0.004 |
| square_meters | 9671 | 48.7 | 198.677 | 0.000 |
| cantonAppenzell-Ausser-Rhoden | -49355 | 58483.4 | -0.844 | 0.399 |
| cantonAppenzell-Inner-Rhoden | 119077 | 119190.4 | 0.999 | 0.318 |
| cantonBasel-Landschaft | 119189 | 32646.7 | 3.651 | 0.000 |
| cantonBasel-Stadt | 550598 | 75191.0 | 7.323 | 0.000 |
| cantonBern | -143302 | 25791.4 | -5.556 | 0.000 |
| cantonFribourg | -138139 | 26509.2 | -5.211 | 0.000 |
| cantonGeneva | 1369130 | 37877.4 | 36.146 | 0.000 |
| cantonGlarus | -257328 | 97006.6 | -2.653 | 0.008 |
| cantonGrisons | 236240 | 45281.4 | 5.217 | 0.000 |
| cantonJura | -634396 | 43838.1 | -14.471 | 0.000 |
| cantonLucerne | 127651 | 37351.1 | 3.418 | 0.001 |
| cantonNeuchatel | -227592 | 35855.3 | -6.348 | 0.000 |
| cantonNidwalden | 446997 | 92461.7 | 4.834 | 0.000 |
| cantonObwalden | 813963 | 115379.6 | 7.055 | 0.000 |
| cantonSchaffhausen | -118840 | 69881.8 | -1.701 | 0.089 |
| cantonSchwyz | 430523 | 73325.5 | 5.871 | 0.000 |
| cantonSolothurn | -196911 | 35907.3 | -5.484 | 0.000 |
| cantonSt-Gallen | -54343 | 30354.3 | -1.790 | 0.073 |
| cantonThurgau | -83863 | 32028.8 | -2.618 | 0.009 |
| cantonTicino | 51018 | 22919.1 | 2.226 | 0.026 |
| cantonUri | 116303 | 86335.9 | 1.347 | 0.178 |
| cantonValais | -43448 | 22514.3 | -1.930 | 0.054 |
| cantonVaud | 374337 | 23673.3 | 15.813 | 0.000 |
| cantonZug | 1333286 | 62892.0 | 21.200 | 0.000 |
| cantonZurich | 431452 | 29063.2 | 14.845 | 0.000 |
| property_typeAttic flat | 71411 | 22532.7 | 3.169 | 0.002 |
| property_typeBifamiliar house | -287183 | 22370.6 | -12.838 | 0.000 |
| property_typeChalet | 229320 | 24921.8 | 9.202 | 0.000 |
| property_typeDuplex | -213773 | 29049.9 | -7.359 | 0.000 |
| property_typeFarm house | -463341 | 61245.2 | -7.565 | 0.000 |
| property_typeLoft | -193955 | 156278.0 | -1.241 | 0.215 |
| property_typeRoof flat | -112128 | 33006.5 | -3.397 | 0.001 |
| property_typeRustic house | -60336 | 124202.5 | -0.486 | 0.627 |
| property_typeSingle house | -224374 | 12767.8 | -17.573 | 0.000 |
| property_typeTerrace flat | 29724 | 46230.7 | 0.643 | 0.520 |
| property_typeVilla | -46324 | 20056.1 | -2.310 | 0.021 |
| year_category1919-1945 | 248288 | 32007.7 | 7.757 | 0.000 |
| year_category1946-1960 | 281496 | 29638.4 | 9.498 | 0.000 |
| year_category1961-1970 | 238845 | 24197.2 | 9.871 | 0.000 |
| year_category1971-1980 | 314981 | 21909.2 | 14.377 | 0.000 |
| year_category1981-1990 | 269025 | 22265.8 | 12.082 | 0.000 |
| year_category1991-2000 | 436391 | 23034.2 | 18.945 | 0.000 |
| year_category2001-2005 | 427219 | 28254.0 | 15.121 | 0.000 |
| year_category2006-2010 | 510525 | 24421.0 | 20.905 | 0.000 |
| year_category2011-2015 | 549658 | 23882.6 | 23.015 | 0.000 |
| year_category2016-2024 | 490792 | 18754.8 | 26.169 | 0.000 |
impot_cols <- names(properties_filtered)[startsWith(names(properties_filtered), "Impôt")]
# Count the number of NA values in selected columns
na_counts <- colSums(is.na(properties_filtered[impot_cols]))
# Print the counts
print(na_counts)
#> numeric(0)